# This script plots the article-level cosine similarity for Egypt against human scores.
# Article-level cosine similarity and human scores are averaged by year-week before plotting.
# It draws a scatter plot of the weekly means and an over-time trend of the mean human scores.
# It produces Figure J2 in the paper.
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(ggthemes)
library(cowplot)
library(tsibble)
library(readr)

# ------------------------------------------------------------------
# Combine Human Labelled Data
# ------------------------------------------------------------------
# Input folders: the cleaned completed forms and the "valtest" news samples.
responses_dir <- "data/qualtrics/completed_forms_cleaned"
nws_dir <- "data/qualtrics/valtest_samples"

# Every file found in the responses folder is loaded with readRDS() and the
# resulting tables are stacked row-wise into a single data frame.
all_responses <- bind_rows(
  lapply(list.files(responses_dir, full.names = TRUE), readRDS)
)

# Same procedure for the news-sample folder.
all_nws_samples <- bind_rows(
  lapply(list.files(nws_dir, full.names = TRUE), readRDS)
)

# Build the labelled data set that pairs human scores with cosine similarity:
#   1. attach the news-sample columns to every response, matching on "ID";
#   2. discard rows whose ID contains the text "ARTIDCHECK";
#   3. retain only ID, score_avg and cos_sim.
daftar_responses <- left_join(all_responses, all_nws_samples, by = "ID")
daftar_responses <- filter(daftar_responses, !grepl("ARTIDCHECK", ID))
daftar_responses <- select(daftar_responses, ID, score_avg, cos_sim)

# Persist the result for use outside this script.
saveRDS(daftar_responses, file = "data/qualtrics/labelled_data.rds")

# Derive tsibble year-month and year-week indices from each sample's `date`
# column; `yearwk` is the grouping key used by both plots below.
all_nws_samples <- mutate(
  all_nws_samples,
  yearmon = tsibble::yearmonth(date),
  yearwk  = tsibble::yearweek(date)
)

# ------------------------------------------------------------------
# Plot 1: Mean year-week cosine similarity vs. human scores
# ------------------------------------------------------------------
# Scatter plot with one point per year-week: mean cosine similarity on the
# x-axis, mean human score on the y-axis, plus a straight-line (lm) fit.
g5 <- local({
  # Attach the news-sample columns (including yearwk) to each response.
  scored <- left_join(all_responses, all_nws_samples, by = "ID")
  # Keep rows with score_avg below 10; filter() also drops rows where the
  # comparison is NA, i.e. missing scores.
  # NOTE(review): rows whose ID contains "ARTIDCHECK" are removed from the
  # saved labelled data but not here - confirm that is intended.
  scored <- filter(scored, score_avg < 10)

  # Collapse to one row per year-week.
  weekly_means <- summarise(
    group_by(scored, yearwk),
    score_avg = mean(score_avg),
    cos_sim   = mean(cos_sim),
    .groups   = "drop"
  )

  # Theme pieces that are reused several times below.
  text_20   <- element_text(size = 20)
  grid_thin <- element_line(size = 0.1, linetype = "solid")

  ggplot(weekly_means, aes(x = cos_sim, y = score_avg)) +
    # Points are displaced randomly by up to 0.1 horizontally and 1.5
    # vertically (data units); the fitted line uses the unjittered means.
    geom_jitter(aes(color = yearwk), height = 1.5, width = 0.1, alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, color = "black") +
    theme_tufte(base_family = "Helvetica") +
    labs(
      x = "Mean year-week cosine similarity score",
      y = "Mean year-week scores by human labellers"
    ) +
    theme(
      axis.text = text_20,
      axis.title = text_20,
      panel.border = element_rect(colour = "black", fill = NA, size = 1),
      plot.background = element_rect(fill = "white", colour = NA),
      panel.grid.major = grid_thin,
      panel.grid.minor = grid_thin,
      strip.text = text_20
    )
})

# ------------------------------------------------------------------
# Plot 2: Over-time trend of mean human scores (by year-week)
# ------------------------------------------------------------------
# Time trend with one point per year-week: the week (converted to a Date) on
# the x-axis, the mean human score on the y-axis, plus a loess smoother.
g6 <- local({
  # Attach the news-sample columns (including yearwk) to each response.
  scored <- left_join(all_responses, all_nws_samples, by = "ID")
  # Keep rows with score_avg below 10; filter() also drops rows where the
  # comparison is NA, i.e. missing scores.
  # NOTE(review): rows whose ID contains "ARTIDCHECK" are removed from the
  # saved labelled data but not here - confirm that is intended.
  scored <- filter(scored, score_avg < 10)

  # Mean human score for each year-week.
  weekly_scores <- summarise(
    group_by(scored, yearwk),
    score_avg = mean(score_avg),
    .groups = "drop"
  )

  # Theme pieces that are reused several times below.
  text_20   <- element_text(size = 20)
  grid_thin <- element_line(size = 0.1, linetype = "solid")

  ggplot(weekly_scores, aes(x = as.Date(yearwk), y = score_avg)) +
    # Points use geom_jitter()'s default displacement; the smoother is
    # fitted to the unjittered weekly means.
    geom_jitter(color = "darkblue", alpha = 0.7) +
    geom_smooth(method = "loess", se = FALSE, color = "red") +
    theme_tufte(base_family = "Helvetica") +
    labs(
      x = "Year-week",
      y = "Average article scores by human labellers"
    ) +
    theme(
      axis.text = text_20,
      axis.title = text_20,
      panel.border = element_rect(colour = "black", fill = NA, size = 1),
      plot.background = element_rect(fill = "white", colour = NA),
      panel.grid.major = grid_thin,
      panel.grid.minor = grid_thin,
      strip.text = text_20
    )
})

# ------------------------------------------------------------------
# Combine and save the plots
# ------------------------------------------------------------------
# Write the two panels side by side (auto-labelled A and B) to plots/figJ2.png.
# Make sure the output folder exists; png() cannot open a file in a missing
# directory. This is a no-op when the folder is already there.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)
png("plots/figJ2.png", width = 300, height = 175, units = "mm", res = 300)
# Explicit print(): top-level values are only auto-printed when the script is
# run line by line or via Rscript. Under source() a bare plot_grid() call
# would draw nothing and the figure would come out empty.
print(plot_grid(g5, g6, nrow = 1, labels = "AUTO"))
dev.off()